# This script uses the post- and arrest-level data to create two data sets:
# One where each observation is an exposure (used to compute overexposure) and 
# one where each observation is an instance of reporting (used to compute overreporting).
# Note that the overexposure file is very large (close to 30GB).

library(tidyverse)
library(lubridate)
library(geosphere)

# Post-level records; matched to agencies through `ori9` further down.
dta_ct <- read_csv(file = "post_data.csv")
# Agency locations; the columns used below are `ori9`, `lng` and `lat`.
distances <- read.csv(file = "location_data.csv")

# Create a long-format table of pairwise distances between ori9s based on
# their location: one row per (ori9, agency) pair with distance `d`.
# distm() is fed (lng, lat) pairs and reports metres by default, so the
# result is divided by 1000 to obtain kilometres.
dm_km <- distm(distances[, c("lng", "lat")]) / 1000
agency_ids <- as.character(distances$ori9)
n_agencies <- length(agency_ids)

# Flatten the matrix column by column (the same row order a gather() over the
# columns would give) rather than going through data.frame(): data.frame()
# rewrites column names with make.names() (e.g. an "X" prefix for ids that
# start with a digit, suffixes for repeated ids), which would corrupt the
# `agency` ids and break the later `ori9 == agency` comparison.
dm <- tibble(
  ori9 = rep(agency_ids, times = n_agencies),
  agency = rep(agency_ids, each = n_agencies),
  # as.integer() truncates to whole kilometres.
  d = as.integer(dm_km)
)
rm(dm_km, agency_ids, n_agencies)

# Drop pairs whose distance could not be computed.
dm <- dm %>%
  filter(!is.na(d))
# Keep only posts whose agency appears in the location data.
dta_ct_match <- dta_ct %>%
  filter(ori9 %in% distances$ori9)
rm(distances)
rm(dta_ct)

# Restrict the distance table to pairs at most 483 km apart *before* joining.
# Joining first and filtering afterwards yields the same rows, but
# materialises every post x every agency as an intermediate result.
dm_near <- dm %>%
  filter(d <= 483)

# One output row per (post, agency within range). An inner join is equivalent
# to the previous left join followed by `filter(d <= 483)`, because that
# filter also discarded unmatched rows (where `d` is NA).
dta_ct_match <- inner_join(dta_ct_match, dm_near, by = "ori9")
rm(dm_near)

# Append the arrest-level records to the exposure rows.
dta_arrest <- read_csv("arrest_data.csv")
dta <- bind_rows(dta_ct_match, dta_arrest)
rm(dta_ct_match)

# For arrest rows, the agency is the arresting ori9 itself, at distance 0.
# `%in%` yields FALSE (not NA) for rows with a missing `type`, so the mask is
# always safe to index with; assigning per column avoids copying the whole
# arrest subset of the table just to change two columns.
is_arrest <- dta$type %in% "arrest"
dta$agency[is_arrest] <- dta$ori9[is_arrest]
# 0L keeps `d` an integer column when it already is one.
dta$d[is_arrest] <- 0L
rm(is_arrest)

# Store exposure data
save(dta, file = "combined_dta_long_483.rdata")

# Store reporting data: keep only the rows where the record's own ori9 is the
# paired agency, then drop the pairing columns.
dta_single <- select(
  filter(dta, agency == ori9),
  -c(agency, d)
)
save(dta_single, file = "dta_single.rdata")


